Binary Model - 1 = Good, 0 = Bad
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Data Preprocessing
# Function to create dataframe from JSON file
def create_dataframe_from_json(filename):
    """Load a JSON-Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to a UTF-8 JSON-Lines file.

    Returns
    -------
    pd.DataFrame
        One row per JSON object, columns taken from the objects' keys.

    Blank lines (e.g. a trailing newline at end of file) are skipped so
    they do not raise ``json.JSONDecodeError``.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Robustness: ignore empty/whitespace-only lines.
            if line.strip():
                data.append(json.loads(line))
    return pd.DataFrame(data)
# Load the raw Yelp review and business dumps (JSON-Lines format).
# NOTE(review): hard-coded absolute Windows paths — consider making these
# configurable (env var / argparse) before sharing the notebook.
review_df = create_dataframe_from_json('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/yelp_dataset/yelp_academic_dataset_review.json')
business_df = create_dataframe_from_json('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/yelp_dataset/yelp_academic_dataset_business.json')
# Merge the two dataframes on the 'business_id' column
# (how='left' keeps every review even if its business record is missing)
merged_df = pd.merge(review_df, business_df, on='business_id', how='left')
# Check the resulting dataframe
print(merged_df.head(10))
review_id user_id business_id \
0 KU_O5udG6zpxOg-VcAEodg mh_-eMZ6K5RLWhZyISBhwA XQfwVwDr-v0ZS3_CbbE5Xw
1 BiTunyQ73aT9WBnpR9DZGw OyoGAe7OKpv6SyGZT5g77Q 7ATYjTIgM3jUlt4UM3IypQ
2 saUsX_uimxRlCVr67Z4Jig 8g_iMtfSiwikVnbP2etR0A YjUWPpI6HXG530lwP-fb2A
3 AqPFMleE6RsU23_auESxiA _7bHUi9Uuf5__HHc_Q8guQ kxX2SOes4o-D3ZQBkiMRfA
4 Sx8TMOWLNuJBWer-0pcmoA bcjbaE6dDog4jkNY91ncLQ e4Vwtrqf-wpJfwesgvdgxQ
5 JrIxlS1TzJ-iCu79ul40cQ eUta8W_HdHMXPzLBBZhL1A 04UD14gamNjLY0IDYVhHJg
6 6AxgBCNX_PNTOxmbRSwcKQ r3zeYsv1XFBRA4dJpL78cw gmjsEdUsKpj9Xxu6pdjH0g
7 _ZeMknuYdlQcUqng_Im3yg yfFzsLmaWF2d4Sr0UNbBgg LHSTtnW3YHCeUkRDGyJOyw
8 ZKvDG2sBvHVdF5oBNUOpAQ wSTuiTk-sKNdcFyprzZAjg B5XSoSG3SfvQGtKEGQ1tSQ
9 pUycOfUwM8vqX7KjRRhUEA 59MxRhNVhU9MYndMkz0wtw gebiRewfieSdtt17PTW6Zg
stars_x useful funny cool \
0 3.0 0 0 0
1 5.0 1 0 1
2 3.0 0 0 0
3 5.0 1 0 1
4 4.0 1 0 1
5 1.0 1 2 1
6 5.0 0 2 0
7 5.0 2 0 0
8 3.0 1 1 0
9 3.0 0 0 0
text date \
0 If you decide to eat here, just be aware it is... 2018-07-07 22:09:11
1 I've taken a lot of spin classes over the year... 2012-01-03 15:28:18
2 Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30
3 Wow! Yummy, different, delicious. Our favo... 2015-01-04 00:01:03
4 Cute interior and owner (?) gave us tour of up... 2017-01-14 20:54:15
5 I am a long term frequent customer of this est... 2015-09-23 23:10:31
6 Loved this tour! I grabbed a groupon and the p... 2015-01-03 23:21:18
7 Amazingly amazing wings and homemade bleu chee... 2015-08-07 02:29:16
8 This easter instead of going to Lopez Lake we ... 2016-03-30 22:46:33
9 Had a party of 6 here for hibachi. Our waitres... 2016-07-25 07:31:06
name ... state postal_code latitude \
0 Turning Point of North Wales ... PA 19454 40.210196
1 Body Cycle Spinning Studio ... PA 19119 39.952103
2 Kettle Restaurant ... AZ 85713 32.207233
3 Zaika ... PA 19114 40.079848
4 Melt ... LA 70119 29.962102
5 Dmitri's ... PA 19147 39.938013
6 The Voodoo Bone Lady Tours ... LA 70170 29.952030
7 Fries Rebellion ... PA 18951 40.407537
8 Los Padres National Forest ... CA 93105 34.597239
9 Hibachi Steak House & Sushi Bar ... CA 93101 34.416984
longitude stars_y review_count is_open \
0 -75.223639 3.0 169 1
1 -75.172753 5.0 144 0
2 -110.980864 3.5 47 1
3 -75.025080 4.0 181 1
4 -90.087958 4.0 32 0
5 -75.148131 4.0 273 0
6 -90.070334 4.5 359 1
7 -75.338825 3.5 103 0
8 -119.510772 4.5 13 1
9 -119.695556 3.5 488 1
attributes \
0 {'NoiseLevel': 'u'average'', 'HasTV': 'False',...
1 {'BusinessAcceptsCreditCards': 'True', 'GoodFo...
2 {'RestaurantsReservations': 'True', 'BusinessP...
3 {'Caters': 'True', 'Ambience': '{'romantic': F...
4 {'BusinessParking': '{'garage': False, 'street...
5 {'BusinessParking': '{'garage': False, 'street...
6 {'GoodForKids': 'True'}
7 {'RestaurantsAttire': ''casual'', 'Ambience': ...
8 {'GoodForKids': 'True', 'BikeParking': 'True',...
9 {'Corkage': 'False', 'RestaurantsTakeOut': 'Tr...
categories \
0 Restaurants, Breakfast & Brunch, Food, Juice B...
1 Active Life, Cycling Classes, Trainers, Gyms, ...
2 Restaurants, Breakfast & Brunch
3 Halal, Pakistani, Restaurants, Indian
4 Sandwiches, Beer, Wine & Spirits, Bars, Food, ...
5 Mediterranean, Restaurants, Seafood, Greek
6 Supernatural Readings, Tours, Hotels & Travel,...
7 Beer Bar, Bars, American (New), Gastropubs, Re...
8 Parks, Active Life
9 Steakhouses, Sushi Bars, Restaurants, Japanese
hours
0 {'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'...
1 {'Monday': '6:30-20:30', 'Tuesday': '6:30-20:3...
2 None
3 {'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21...
4 {'Monday': '0:0-0:0', 'Friday': '11:0-17:0', '...
5 {'Wednesday': '17:30-21:0', 'Thursday': '17:30...
6 {'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...
7 {'Wednesday': '11:0-23:0', 'Thursday': '11:0-2...
8 None
9 {'Monday': '0:0-0:0'}
[10 rows x 22 columns]
merged_df.shape
(6990280, 22)
merged_df.dtypes
review_id object user_id object business_id object stars_x float64 useful int64 funny int64 cool int64 text object date object name object address object city object state object postal_code object latitude float64 longitude float64 stars_y float64 review_count int64 is_open int64 attributes object categories object hours object dtype: object
merged_df.head(10)
| review_id | user_id | business_id | stars_x | useful | funny | cool | text | date | name | ... | state | postal_code | latitude | longitude | stars_y | review_count | is_open | attributes | categories | hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KU_O5udG6zpxOg-VcAEodg | mh_-eMZ6K5RLWhZyISBhwA | XQfwVwDr-v0ZS3_CbbE5Xw | 3.0 | 0 | 0 | 0 | If you decide to eat here, just be aware it is... | 2018-07-07 22:09:11 | Turning Point of North Wales | ... | PA | 19454 | 40.210196 | -75.223639 | 3.0 | 169 | 1 | {'NoiseLevel': 'u'average'', 'HasTV': 'False',... | Restaurants, Breakfast & Brunch, Food, Juice B... | {'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'... |
| 1 | BiTunyQ73aT9WBnpR9DZGw | OyoGAe7OKpv6SyGZT5g77Q | 7ATYjTIgM3jUlt4UM3IypQ | 5.0 | 1 | 0 | 1 | I've taken a lot of spin classes over the year... | 2012-01-03 15:28:18 | Body Cycle Spinning Studio | ... | PA | 19119 | 39.952103 | -75.172753 | 5.0 | 144 | 0 | {'BusinessAcceptsCreditCards': 'True', 'GoodFo... | Active Life, Cycling Classes, Trainers, Gyms, ... | {'Monday': '6:30-20:30', 'Tuesday': '6:30-20:3... |
| 2 | saUsX_uimxRlCVr67Z4Jig | 8g_iMtfSiwikVnbP2etR0A | YjUWPpI6HXG530lwP-fb2A | 3.0 | 0 | 0 | 0 | Family diner. Had the buffet. Eclectic assortm... | 2014-02-05 20:30:30 | Kettle Restaurant | ... | AZ | 85713 | 32.207233 | -110.980864 | 3.5 | 47 | 1 | {'RestaurantsReservations': 'True', 'BusinessP... | Restaurants, Breakfast & Brunch | None |
| 3 | AqPFMleE6RsU23_auESxiA | _7bHUi9Uuf5__HHc_Q8guQ | kxX2SOes4o-D3ZQBkiMRfA | 5.0 | 1 | 0 | 1 | Wow! Yummy, different, delicious. Our favo... | 2015-01-04 00:01:03 | Zaika | ... | PA | 19114 | 40.079848 | -75.025080 | 4.0 | 181 | 1 | {'Caters': 'True', 'Ambience': '{'romantic': F... | Halal, Pakistani, Restaurants, Indian | {'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21... |
| 4 | Sx8TMOWLNuJBWer-0pcmoA | bcjbaE6dDog4jkNY91ncLQ | e4Vwtrqf-wpJfwesgvdgxQ | 4.0 | 1 | 0 | 1 | Cute interior and owner (?) gave us tour of up... | 2017-01-14 20:54:15 | Melt | ... | LA | 70119 | 29.962102 | -90.087958 | 4.0 | 32 | 0 | {'BusinessParking': '{'garage': False, 'street... | Sandwiches, Beer, Wine & Spirits, Bars, Food, ... | {'Monday': '0:0-0:0', 'Friday': '11:0-17:0', '... |
| 5 | JrIxlS1TzJ-iCu79ul40cQ | eUta8W_HdHMXPzLBBZhL1A | 04UD14gamNjLY0IDYVhHJg | 1.0 | 1 | 2 | 1 | I am a long term frequent customer of this est... | 2015-09-23 23:10:31 | Dmitri's | ... | PA | 19147 | 39.938013 | -75.148131 | 4.0 | 273 | 0 | {'BusinessParking': '{'garage': False, 'street... | Mediterranean, Restaurants, Seafood, Greek | {'Wednesday': '17:30-21:0', 'Thursday': '17:30... |
| 6 | 6AxgBCNX_PNTOxmbRSwcKQ | r3zeYsv1XFBRA4dJpL78cw | gmjsEdUsKpj9Xxu6pdjH0g | 5.0 | 0 | 2 | 0 | Loved this tour! I grabbed a groupon and the p... | 2015-01-03 23:21:18 | The Voodoo Bone Lady Tours | ... | LA | 70170 | 29.952030 | -90.070334 | 4.5 | 359 | 1 | {'GoodForKids': 'True'} | Supernatural Readings, Tours, Hotels & Travel,... | {'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'... |
| 7 | _ZeMknuYdlQcUqng_Im3yg | yfFzsLmaWF2d4Sr0UNbBgg | LHSTtnW3YHCeUkRDGyJOyw | 5.0 | 2 | 0 | 0 | Amazingly amazing wings and homemade bleu chee... | 2015-08-07 02:29:16 | Fries Rebellion | ... | PA | 18951 | 40.407537 | -75.338825 | 3.5 | 103 | 0 | {'RestaurantsAttire': ''casual'', 'Ambience': ... | Beer Bar, Bars, American (New), Gastropubs, Re... | {'Wednesday': '11:0-23:0', 'Thursday': '11:0-2... |
| 8 | ZKvDG2sBvHVdF5oBNUOpAQ | wSTuiTk-sKNdcFyprzZAjg | B5XSoSG3SfvQGtKEGQ1tSQ | 3.0 | 1 | 1 | 0 | This easter instead of going to Lopez Lake we ... | 2016-03-30 22:46:33 | Los Padres National Forest | ... | CA | 93105 | 34.597239 | -119.510772 | 4.5 | 13 | 1 | {'GoodForKids': 'True', 'BikeParking': 'True',... | Parks, Active Life | None |
| 9 | pUycOfUwM8vqX7KjRRhUEA | 59MxRhNVhU9MYndMkz0wtw | gebiRewfieSdtt17PTW6Zg | 3.0 | 0 | 0 | 0 | Had a party of 6 here for hibachi. Our waitres... | 2016-07-25 07:31:06 | Hibachi Steak House & Sushi Bar | ... | CA | 93101 | 34.416984 | -119.695556 | 3.5 | 488 | 1 | {'Corkage': 'False', 'RestaurantsTakeOut': 'Tr... | Steakhouses, Sushi Bars, Restaurants, Japanese | {'Monday': '0:0-0:0'} |
10 rows × 22 columns
# Keep only the features needed for modeling.
# .copy() makes df an independent DataFrame: without it, the later dropna /
# drop_duplicates calls operate on a slice of merged_df and raise the
# SettingWithCopyWarning seen further down in this notebook.
df = merged_df[['text', 'stars_x', 'categories', 'state', 'name']].copy()
# Check the resulting dataframe
df.head(10)
| text | stars_x | categories | state | name | |
|---|---|---|---|---|---|
| 0 | If you decide to eat here, just be aware it is... | 3.0 | Restaurants, Breakfast & Brunch, Food, Juice B... | PA | Turning Point of North Wales |
| 1 | I've taken a lot of spin classes over the year... | 5.0 | Active Life, Cycling Classes, Trainers, Gyms, ... | PA | Body Cycle Spinning Studio |
| 2 | Family diner. Had the buffet. Eclectic assortm... | 3.0 | Restaurants, Breakfast & Brunch | AZ | Kettle Restaurant |
| 3 | Wow! Yummy, different, delicious. Our favo... | 5.0 | Halal, Pakistani, Restaurants, Indian | PA | Zaika |
| 4 | Cute interior and owner (?) gave us tour of up... | 4.0 | Sandwiches, Beer, Wine & Spirits, Bars, Food, ... | LA | Melt |
| 5 | I am a long term frequent customer of this est... | 1.0 | Mediterranean, Restaurants, Seafood, Greek | PA | Dmitri's |
| 6 | Loved this tour! I grabbed a groupon and the p... | 5.0 | Supernatural Readings, Tours, Hotels & Travel,... | LA | The Voodoo Bone Lady Tours |
| 7 | Amazingly amazing wings and homemade bleu chee... | 5.0 | Beer Bar, Bars, American (New), Gastropubs, Re... | PA | Fries Rebellion |
| 8 | This easter instead of going to Lopez Lake we ... | 3.0 | Parks, Active Life | CA | Los Padres National Forest |
| 9 | Had a party of 6 here for hibachi. Our waitres... | 3.0 | Steakhouses, Sushi Bars, Restaurants, Japanese | CA | Hibachi Steak House & Sushi Bar |
# Summarize data-quality issues before cleaning: null counts per column,
# number of fully duplicated rows, and the total row count.
null_counts = df.isnull().sum()
print("Number of records with missing values:")
print(null_counts)

dup_count = df.duplicated().sum()
print("\nNumber of duplicate records:")
print(dup_count)

row_count = len(df)
print("\nTotal number of records before cleaning:", row_count)
Number of records with missing values: text 0 stars_x 0 categories 689 state 0 name 0 dtype: int64 Number of duplicate records: 12007 Total number of records before cleaning: 6990280
# Inspect a sample of the duplicated rows.
# keep=False flags *every* copy of a duplicated row, not just the repeats.
duplicate_records = df[df.duplicated(keep=False)]
duplicate_records.head(20)
| text | stars_x | categories | state | name | |
|---|---|---|---|---|---|
| 749 | I've always had good experiences here. The foo... | 4.0 | Restaurants, American (New) | AZ | Nox Kitchen + Cocktails |
| 1283 | Great food... Love the Fish tacos & Nachos are... | 5.0 | Seafood, American (Traditional), Music Venues,... | FL | Sam's Beach Bar |
| 2244 | We were back last night for the second time: a... | 5.0 | Restaurants, Nightlife, Japanese, Bars | CA | Yoichi's |
| 2855 | I ate like a Queen! A breakfeast Queen- and I... | 5.0 | Restaurants, Southern, Cajun/Creole, Seafood, ... | LA | Mena's Palace |
| 3311 | We are visiting in Philadelphia from Californi... | 5.0 | Active Life, Beer Gardens, Grocery, Middle Eas... | PA | Suraya |
| 3572 | While I was moving in and had all my stuff on ... | 1.0 | Home Services, Real Estate, Apartments | FL | Egret's Landing Apartments |
| 4094 | I have no idea what these people are talking a... | 3.0 | Restaurants, New Mexican Cuisine, Mexican | LA | Burritos Grill Mexican Fresh Cuisine |
| 4391 | This Wendy's is a good location. Open 24 hours... | 4.0 | Hot Dogs, American (New), Food, Burgers, Fast ... | NJ | Wendy's |
| 4673 | The person next door reported a leak coming fr... | 5.0 | Home Services, Plumbing | FL | Tom Shell Plumbing |
| 4873 | The wash was very average and very little atte... | 3.0 | Auto Detailing, Car Wash, Automotive | FL | Mike's Auto Detailing |
| 5478 | Awesome food...everything was fresh and made t... | 5.0 | Japanese, Korean, Asian Fusion, Tacos, Mexican... | TN | Soy Bistro |
| 5542 | Stopped in here with my wife and a friend for ... | 1.0 | Fast Food, Event Planning & Services, Restaura... | IN | QDOBA Mexican Eats |
| 5960 | We went with the simple choice and were not di... | 5.0 | Restaurants, Sandwiches, Food, Breakfast & Bru... | TN | The Pancake Pantry |
| 6006 | love this place! The owners are 3 brothers rig... | 5.0 | Pizza, Restaurants | PA | Spatola's Pizza |
| 6306 | Beware of smiling Joe! We bought living room f... | 1.0 | Home & Garden, Home Decor, Furniture Stores, S... | FL | Ashley HomeStore |
| 6424 | I brought my cousin here on Christmas night. T... | 1.0 | Bars, Nightlife, American (Traditional), Music... | NV | Brew Brothers |
| 6689 | We walked into this bar last night - December ... | 5.0 | Restaurants, Tiki Bars, Mexican, Bars, Cocktai... | LA | Tiki Tolteca |
| 7001 | Fabulous!!! My family orders from here all the... | 5.0 | Seafood, Pizza, Restaurants, Italian | PA | Main Street Pizzeria & Grille |
| 7021 | Diner food does not get any better than this!!... | 5.0 | Restaurants, Diners, Breakfast & Brunch | FL | Pop N Sons Diner |
| 7522 | Every time I go there the service is terrible ... | 1.0 | Barbeque, Restaurants, American (Traditional),... | LA | Dickey's Barbecue Pit |
After inspecting the duplicated records, they appear to be ordinary entries, so their presence is most likely the result of an error during data handling or uploading. Since they provide no additional information, keeping them is unlikely to improve model performance; we therefore remove these duplicates as part of the data cleaning process.
# Drop rows with missing values, then exact duplicates.
# Reassigning (instead of inplace=True on what may be a slice of merged_df)
# avoids the SettingWithCopyWarning shown below and guarantees df is an
# independent DataFrame from here on.
df = df.dropna().drop_duplicates()

# Display the number of records after cleaning
total_records_after_cleaning = len(df)
print("\nTotal number of records after cleaning:", total_records_after_cleaning)
C:\Users\ndhu2\AppData\Local\Temp\ipykernel_20132\611006536.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.dropna(inplace=True)
Total number of records after cleaning: 6977585
C:\Users\ndhu2\AppData\Local\Temp\ipykernel_20132\611006536.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.drop_duplicates(inplace=True)
Exploratory Data Analysis
df.describe(include='all')
| text | stars_x | categories | state | name | |
|---|---|---|---|---|---|
| count | 6977585 | 6.977585e+06 | 6977585 | 6977585 | 6977585 |
| unique | 6973440 | NaN | 83160 | 27 | 114026 |
| top | DO NOT PARK HERE!\nthey are too quick to boot ... | NaN | Mexican, Restaurants | PA | Starbucks |
| freq | 18 | NaN | 54841 | 1596380 | 21532 |
| mean | NaN | 3.748519e+00 | NaN | NaN | NaN |
| std | NaN | 1.478515e+00 | NaN | NaN | NaN |
| min | NaN | 1.000000e+00 | NaN | NaN | NaN |
| 25% | NaN | 3.000000e+00 | NaN | NaN | NaN |
| 50% | NaN | 4.000000e+00 | NaN | NaN | NaN |
| 75% | NaN | 5.000000e+00 | NaN | NaN | NaN |
| max | NaN | 5.000000e+00 | NaN | NaN | NaN |
# Stars Distribution Analysis: visualize how ratings spread across 1-5 stars.
df['stars_x'].hist(bins=20, figsize=(8, 6))
plt.xlabel('Star Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Star Ratings')
plt.show()
As the dataset exhibits an uneven distribution of star ratings, developing a model to predict or identify the sentiment of reviews poses a challenge. The significant difference between the majority class (5 stars) and the minority class (2 stars) may introduce bias during model training. To mitigate this issue, we plan to implement undersampling after splitting the data into training and test datasets, leveraging the abundance of available data.
# Persist the cleaned DataFrame to disk, then reload it to verify the
# CSV round trip before continuing with feature engineering.
csv_path = 'yelp_data.csv'
df.to_csv(csv_path, index=False)
new_df = pd.read_csv(csv_path)
new_df.head()
| text | stars_x | categories | state | name | |
|---|---|---|---|---|---|
| 0 | If you decide to eat here, just be aware it is... | 3.0 | Restaurants, Breakfast & Brunch, Food, Juice B... | PA | Turning Point of North Wales |
| 1 | I've taken a lot of spin classes over the year... | 5.0 | Active Life, Cycling Classes, Trainers, Gyms, ... | PA | Body Cycle Spinning Studio |
| 2 | Family diner. Had the buffet. Eclectic assortm... | 3.0 | Restaurants, Breakfast & Brunch | AZ | Kettle Restaurant |
| 3 | Wow! Yummy, different, delicious. Our favo... | 5.0 | Halal, Pakistani, Restaurants, Indian | PA | Zaika |
| 4 | Cute interior and owner (?) gave us tour of up... | 4.0 | Sandwiches, Beer, Wine & Spirits, Bars, Food, ... | LA | Melt |
new_df.shape
(6977585, 5)
new_df.dtypes
text object stars_x float64 categories object state object name object dtype: object
# Create a new feature containing the character length of each review.
# Vectorized .str.len() is much faster than .apply(len) over ~7M rows and
# yields the same integer lengths for non-null strings.
new_df['review_length'] = new_df['text'].str.len()
new_df.head()
| text | stars_x | categories | state | name | review_length | |
|---|---|---|---|---|---|---|
| 0 | If you decide to eat here, just be aware it is... | 3.0 | Restaurants, Breakfast & Brunch, Food, Juice B... | PA | Turning Point of North Wales | 513 |
| 1 | I've taken a lot of spin classes over the year... | 5.0 | Active Life, Cycling Classes, Trainers, Gyms, ... | PA | Body Cycle Spinning Studio | 829 |
| 2 | Family diner. Had the buffet. Eclectic assortm... | 3.0 | Restaurants, Breakfast & Brunch | AZ | Kettle Restaurant | 339 |
| 3 | Wow! Yummy, different, delicious. Our favo... | 5.0 | Halal, Pakistani, Restaurants, Indian | PA | Zaika | 243 |
| 4 | Cute interior and owner (?) gave us tour of up... | 4.0 | Sandwiches, Beer, Wine & Spirits, Bars, Food, ... | LA | Melt | 534 |
Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
# Download NLTK resources (no-ops when already installed, as the log shows)
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # lexical database backing WordNetLemmatizer
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\ndhu2\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\ndhu2\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\ndhu2\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
True
We decided to expand contractions in our dataset. This reduces computational requirements by lowering the dimensionality of the vocabulary, and it ensures that a contracted form and its expanded form (e.g. "didn't" and "did not") contribute to the model in the same way.
import re
# Dictionary of English contractions mapped to their expanded forms.
# Used by expand_contractions() to normalize review text before tokenization.
# NOTE(review): a few expansions are inconsistent — "it'd"/"we'd"/"you'd"
# expand to "... had" while "he'd"/"she'd"/"they'd" expand to "... would";
# confirm which reading is intended before relying on them downstream.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}
# Function to expand contractions
def expand_contractions(text, contractions_dict):
    """Replace every contraction in *text* with its expanded form.

    Parameters
    ----------
    text : str
        The text to expand.
    contractions_dict : dict[str, str]
        Mapping of contraction -> expansion.

    Returns
    -------
    str
        *text* with all case-insensitive matches of the dictionary keys
        replaced; unmatched text is left untouched.
    """
    # BUG FIX: sort keys longest-first. With insertion order, the alternation
    # tried "can't" before "can't've", so "can't've" expanded to "cannot've".
    # re.escape additionally guards against regex metacharacters in keys.
    sorted_keys = sorted(contractions_dict.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        r'\b(' + '|'.join(re.escape(key) for key in sorted_keys) + r')\b',
        flags=re.IGNORECASE,
    )
    # Lower-cased lookup table so case-insensitive matches resolve correctly.
    processed_dict = {key.lower(): value for key, value in contractions_dict.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        return processed_dict.get(match.lower(), match)

    return contractions_pattern.sub(expand_match, text)
# NOTE(review): removed a superseded version of preprocess_text that was kept
# here as a commented-out triple-quoted string (dead code: no URL stripping,
# no negation marking). Recover it from version control if needed.
import re
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Hoisted out of preprocess_text: constructing the lemmatizer and reading the
# stopword list on every call is expensive when applied to ~7M reviews.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    """Clean one raw review string and return a space-joined token string.

    Pipeline: lowercase -> strip URLs -> ASCII-fold accents -> expand
    contractions -> tokenize -> prefix the token after "not"/"no" with
    "not_" -> drop punctuation, stopwords and non-alphanumeric tokens
    (negation-marked "not_*" tokens are kept) -> lemmatize -> re-join.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Preprocessed tokens joined by single spaces.
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs using regular expression
    text = re.sub(r'http\S+', '', text)
    # Normalize text (Remove accents, diacritics, etc.)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    # Expand contractions so negations surface as a separate "not" token
    text = expand_contractions(text, contractions_dict)
    # Tokenization
    tokens = word_tokenize(text)
    # Mark the single token following a negation word so its sentiment
    # survives stopword removal (e.g. "not good" -> "not_good").
    following_negation = False
    for i, token in enumerate(tokens):
        if token in ["not", "no"]:
            following_negation = True
        elif following_negation:
            tokens[i] = "not_" + token
            following_negation = False
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # Remove stopwords (this also drops the bare "not"/"no" markers)
    tokens = [token for token in tokens if token not in stop_words]
    # Remove non-alphanumeric tokens except for negation-marked ones
    tokens = [token for token in tokens if token.isalnum() or token.startswith('not_')]
    # Whitespace removal
    tokens = [token.strip() for token in tokens]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into text
    return ' '.join(tokens)
# Quick sanity check of the pipeline on a negation-heavy sentence
# (exercises contraction expansion and the "not_" marking).
sample = "I didn't like it. It wasn't good at all. But at the end, it wasn't that bad"
preprocessed_test_text = preprocess_text(sample)
print("Preprocessed test text:", preprocessed_test_text)
Preprocessed test text: not_like not_good end not_that bad
# Apply text preprocessing to the 'text' column
# NOTE(review): this runs single-threaded over ~7M rows and is slow;
# consider chunking or parallelizing if re-run frequently.
new_df['preprocessed_text'] = new_df['text'].apply(preprocess_text)
"""import unicodedata
# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Function for text preprocessing
def preprocess_text(text):
# Convert text to lowercase
text = text.lower()
# Expand contractions
text = expand_contractions(text, contractions_dict)
# Normalize text (Remove accents, diacritics, etc.)
text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
# Tokenization
tokens = word_tokenize(text)
# Remove punctuation
tokens = [token for token in tokens if token not in string.punctuation]
# Remove stopwords
tokens = [token for token in tokens if token not in stop_words]
# Lemmatization
tokens = [lemmatizer.lemmatize(token) for token in tokens]
# Join tokens back into text
preprocessed_text = ' '.join(tokens)
return preprocessed_text
# Apply text preprocessing to the 'text' column
sample_df = pd.DataFrame()
# Take the first 20 records of the 'text' column from new_df
texts_to_process = new_df['text'].iloc[:20]
# Apply text preprocessing to the selected texts and assign it to the new column 'preprocessed_text'
sample_df['preprocessed_text'] = texts_to_process.apply(preprocess_text)
# Display the resulting DataFrame
print(sample_df)"""
preprocessed_text 0 decide eat aware going take 2 hour beginning e... 1 I taken lot spin class year nothing compare cl... 2 family diner buffet eclectic assortment large ... 3 wow yummy different delicious favorite lamb cu... 4 cute interior owner gave u tour upcoming patio... 5 long term frequent customer establishment went... 6 loved tour grabbed groupon price great perfect... 7 amazingly amazing wing homemade bleu cheese ri... 8 easter instead going lopez lake went los padre... 9 party 6 hibachi waitress brought separate sush... 10 experience shalimar nothing wonderful wanted g... 11 local recommended milktooth amazing jewel indi... 12 love going happy hour dinner great patio fan b... 13 good food -- loved gnocchi marinara baked eggp... 14 bun make sonoran dog like snuggie pup first se... 15 great place breakfast waffle fluffy perfect ho... 16 tremendous service big shout douglas complemen... 17 hubby multiple occasion loved every part meal ... 18 go blow bar get brow done natalie brow special... 19 absolute favorite cafe city black white latte ...
# NOTE: commented-out (string-wrapped) earlier version of the preprocessing
# cell, kept for reference. This variant removes non-alphanumeric characters
# with a regex (keeping digits) instead of filtering punctuation tokens.
"""# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
# Convert text to lowercase
text = text.lower()
# Remove URLs using regular expression
text = re.sub(r'http\S+', '', text)
# Remove non-alphanumeric characters, but keep numbers
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Tokenization
tokens = word_tokenize(text)
# Remove stopwords
tokens = [token for token in tokens if token not in stop_words]
# Lemmatization
tokens = [lemmatizer.lemmatize(token) for token in tokens]
# Join tokens back into text
preprocessed_text = ' '.join(tokens)
return preprocessed_text
# Apply text preprocessing to the 'text' column
new_df['preprocessed_text'] = new_df['text'].apply(preprocess_text)"""
We decided to implement lemmatization instead of stemming, as lemmatization tends to produce more linguistically accurate results, which can lead to better model performance.
Data Engineering¶
# Preview the first 50 rows of the engineered DataFrame
new_df.head(50)
| text | stars_x | categories | state | name | review_length | preprocessed_text | |
|---|---|---|---|---|---|---|---|
| 0 | If you decide to eat here, just be aware it is... | 3.0 | Restaurants, Breakfast & Brunch, Food, Juice B... | PA | Turning Point of North Wales | 513 | decide eat aware going take 2 hour beginning e... |
| 1 | I've taken a lot of spin classes over the year... | 5.0 | Active Life, Cycling Classes, Trainers, Gyms, ... | PA | Body Cycle Spinning Studio | 829 | I taken lot spin class year nothing compare cl... |
| 2 | Family diner. Had the buffet. Eclectic assortm... | 3.0 | Restaurants, Breakfast & Brunch | AZ | Kettle Restaurant | 339 | family diner buffet eclectic assortment large ... |
| 3 | Wow! Yummy, different, delicious. Our favo... | 5.0 | Halal, Pakistani, Restaurants, Indian | PA | Zaika | 243 | wow yummy different delicious favorite lamb cu... |
| 4 | Cute interior and owner (?) gave us tour of up... | 4.0 | Sandwiches, Beer, Wine & Spirits, Bars, Food, ... | LA | Melt | 534 | cute interior owner gave u tour upcoming area ... |
| 5 | I am a long term frequent customer of this est... | 1.0 | Mediterranean, Restaurants, Seafood, Greek | PA | Dmitri's | 341 | long term frequent customer establishment went... |
| 6 | Loved this tour! I grabbed a groupon and the p... | 5.0 | Supernatural Readings, Tours, Hotels & Travel,... | LA | The Voodoo Bone Lady Tours | 804 | loved tour grabbed groupon price great perfect... |
| 7 | Amazingly amazing wings and homemade bleu chee... | 5.0 | Beer Bar, Bars, American (New), Gastropubs, Re... | PA | Fries Rebellion | 192 | amazingly amazing wing homemade bleu cheese ri... |
| 8 | This easter instead of going to Lopez Lake we ... | 3.0 | Parks, Active Life | CA | Los Padres National Forest | 526 | easter instead going lopez lake went los padre... |
| 9 | Had a party of 6 here for hibachi. Our waitres... | 3.0 | Steakhouses, Sushi Bars, Restaurants, Japanese | CA | Hibachi Steak House & Sushi Bar | 524 | party 6 hibachi waitress brought separate sush... |
| 10 | My experience with Shalimar was nothing but wo... | 5.0 | Shopping, Jewelry | FL | Shalimar Fine Jewelers | 1009 | experience shalimar nothing wonderful wanted g... |
| 11 | Locals recommended Milktooth, and it's an amaz... | 4.0 | Beer, Wine & Spirits, Cafes, Coffee & Tea, Res... | IN | Milktooth | 119 | local recommended milktooth amazing jewel indi... |
| 12 | Love going here for happy hour or dinner! Gre... | 4.0 | Bars, Pizza, Nightlife, Cocktail Bars, Italian... | MO | Brio Italian Grille | 242 | love going happy hour dinner great patio fan b... |
| 13 | Good food--loved the gnocchi with marinara\nth... | 4.0 | Pizza, Restaurants, Italian, Salad | PA | LaScala's | 175 | good food loved gnocchi marinara baked eggplan... |
| 14 | The bun makes the Sonoran Dog. It's like a snu... | 4.0 | Restaurants, Tacos, Mexican, Hot Dogs, Breakfa... | AZ | BK Tacos | 658 | bun make sonoran dog like snuggie pup first se... |
| 15 | Great place for breakfast! I had the waffle, w... | 5.0 | Sandwiches, Restaurants, American (New), Ameri... | FL | Mamas Kitchen | 175 | great place breakfast waffle fluffy perfect ho... |
| 16 | Tremendous service (Big shout out to Douglas) ... | 5.0 | Wine Bars, Restaurants, Nightlife, Steakhouses... | PA | Rittenhouse Grill | 276 | tremendous service big shout douglas complemen... |
| 17 | The hubby and I have been here on multiple occ... | 4.0 | Wine Bars, Bars, Nightlife, American (New), Me... | MO | Olio | 577 | hubby multiple occasion loved every part meal ... |
| 18 | I go to blow bar to get my brows done by natal... | 5.0 | Makeup Artists, Blow Dry/Out Services, Beauty ... | FL | Blow Bar Express Styling Salon | 393 | go blow bar get brow done natalie brow special... |
| 19 | My absolute favorite cafe in the city. Their b... | 5.0 | Food, Cafes, Coffee & Tea, Restaurants | PA | Good Karma Cafe | 419 | absolute favorite cafe city black white latte ... |
| 20 | HOLY SMOKES!\n\nactual pumpkin pie mixed in wi... | 5.0 | Ice Cream & Frozen Yogurt, Food, Local Flavor,... | MO | Ted Drewes | 249 | holy smoke actual pumpkin pie mixed frozen cus... |
| 21 | Upland is a brewery based out of Bloomington, ... | 3.0 | Nightlife, Food, Bars, Breweries, Pizza, Brewp... | IN | Upland Carmel Tap House | 736 | upland brewery based bloomington indiana becom... |
| 22 | I thoroughly enjoyed the show. Chill way to s... | 5.0 | Performing Arts, Arts & Entertainment, Nightli... | PA | The N Crowd | 66 | thoroughly enjoyed show chill way spend friday... |
| 23 | Yes, this is the only sushi place in town. How... | 4.0 | Restaurants, Sushi Bars | CA | Sushi Teri | 325 | yes sushi place town however great craving sus... |
| 24 | I was really between 3 and 4 stars for this on... | 4.0 | Restaurants, Food, Poke, Hawaiian, Sushi Bars | IN | Naked Tchopstix Express | 1555 | really 3 4 star one love 96th street naked tch... |
| 25 | Went for lunch. Beef brisket sandwich was awes... | 4.0 | American (New), Restaurants, Cocktail Bars, Ba... | IN | Barbecue and Bourbon | 110 | went lunch beef brisket sandwich awesome juicy... |
| 26 | Best thai food in the area. Everything was au... | 5.0 | Thai, Restaurants | PA | Thai Place Restaurant | 110 | best thai food area everything authentic delic... |
| 27 | Service was crappy, and food was mediocre. I ... | 3.0 | Cajun/Creole, Seafood, Restaurants, Breakfast ... | LA | Creole House Restaurant & Oyster Bar | 115 | service crappy food mediocre wish would picked... |
| 28 | I recently had dinner here with my wife over t... | 5.0 | Event Planning & Services, Italian, Venues & E... | PA | Anthony's at Paxon Hollow | 479 | recently dinner wife weekend could not_have pl... |
| 29 | I at least have to give this restaurant two st... | 2.0 | Cocktail Bars, Nightlife, Gastropubs, Sports B... | TN | Tavern | 473 | least give restaurant two star due decent food... |
| 30 | First time there and it was excellent!!! It fe... | 5.0 | Restaurants, Seafood, Cafes, Italian | PA | Portobello Cafe | 222 | first time excellent feel like entering someon... |
| 31 | Great burgers,fries and salad! Burgers have a... | 5.0 | Fast Food, Burgers, Restaurants | CA | The Original Habit Burger Grill | 209 | great burger fry salad burger hint salt pepper... |
| 32 | Great staff always helps and always nice. Alwa... | 5.0 | Food, Coffee & Tea, Gas Stations, Restaurants,... | PA | Wawa | 169 | great staff always help always nice always cle... |
| 33 | Took my vehicle here for some work a few years... | 5.0 | Auto Repair, Smog Check Stations, Auto Parts &... | NV | Landa Muffler & Brake | 384 | took vehicle work year ago manufacturer recall... |
| 34 | After my ROTD yesterday of a different Sweet ... | 4.0 | Food, Ice Cream & Frozen Yogurt | TN | Sweet Cece's | 490 | rotd yesterday different sweet cece location r... |
| 35 | What a great addition to the Funk Zone! Grab ... | 5.0 | Food, Restaurants, Salad, Coffee & Tea, Breakf... | CA | Helena Avenue Bakery | 222 | great addition funk zone grab bite grab tastin... |
| 36 | Nice relaxing place to get a massage! Same day... | 5.0 | Health & Medical, Beauty & Spas, Massage, Phys... | NV | Ralston Massage Center | 156 | nice relaxing place get massage day appointmen... |
| 37 | We checked in around 2:30 pm. Check-in was qu... | 4.0 | Event Planning & Services, Casinos, Beauty & S... | NV | Peppermill Reno | 1369 | checked around pm quick easy complimentary val... |
| 38 | My boyfriend and I tried this deli for the fir... | 5.0 | Restaurants, Delis, Salad, Sandwiches | PA | The Coventry Deli | 393 | boyfriend tried deli first time today turkey a... |
| 39 | Amazing biscuits and (fill in the blank). Grea... | 5.0 | American (New), Restaurants, American (Traditi... | TN | Milk and Honey Nashville | 101 | amazing biscuit fill blank great cocktail high... |
| 40 | Food was good- atmosphere/decor is like a fish... | 4.0 | Seafood, Steakhouses, Salad, Comfort Food, Res... | FL | Aquafinz | 222 | food like fishing menu someplace outback bonef... |
| 41 | Straight to the point, it's cheap, it tastes a... | 2.0 | American (New), Restaurants, Buffets, Breakfas... | NV | The Buffet | 621 | straight point cheap taste feel cheap good pri... |
| 42 | The only reason I didn't give this restaurant ... | 4.0 | American (New), Breakfast & Brunch, Bars, Nigh... | PA | Square 1682 | 2343 | reason not_give restaurant 5 star rating one s... |
| 43 | Stopped by after a Sunday morning walk in the ... | 5.0 | Bagels, Sporting Goods, Outdoor Gear, Coffee &... | TN | Three Brothers Coffee | 112 | stopped sunday morning walk park great food co... |
| 44 | In a word... "OVERRATED!". The food took fore... | 3.0 | Bars, Breakfast & Brunch, Restaurants, Barbequ... | LA | Mr. B's Bistro | 363 | word overrated food took forever come burger w... |
| 45 | Comfortable bed, good breakfast, fast internet... | 4.0 | Hotels, Event Planning & Services, Caterers, H... | PA | DoubleTree by Hilton Hotel Philadelphia Center... | 233 | comfortable bed good breakfast fast internet g... |
| 46 | NEVER AGAIN. This is a so called restaurant th... | 2.0 | Jazz & Blues, Bars, Arts & Entertainment, Beer... | LA | Bacchanal Fine Wine & Spirits | 590 | never called restaurant nothing restaurant pre... |
| 47 | If you want to pay for everything a la carte t... | 1.0 | American (New), Restaurants, Mexican | FL | El Chicanito Mexican Restaurant | 1016 | want pay everything la carte place food not_te... |
| 48 | The cafe was extremely cute. We came at 8am an... | 4.0 | Sandwiches, Breakfast & Brunch, Cajun/Creole, ... | LA | Cafe Beignet on Bourbon Street | 248 | cafe extremely cute came 8am even jazz band pl... |
| 49 | On a scale of one to things that are awesome, ... | 5.0 | Bars, Nightlife, Whiskey Bars, Burgers, Restau... | PA | Village Whiskey | 934 | scale one thing awesome place bomb drawn promi... |
# Persist the engineered DataFrame (including 'preprocessed_text') so later
# cells can reload it without re-running the expensive preprocessing
new_df.to_csv('yelp_lemmatized_data.csv', index=False)
Model Training¶
# Reload the lemmatized dataset from disk into a fresh DataFrame
df = pd.read_csv('yelp_lemmatized_data.csv')
# Inspect column dtypes (text columns load as object, stars_x as float64)
df.dtypes
text object stars_x float64 categories object state object name object review_length int64 preprocessed_text object dtype: object
# EDA: compare review-length distributions for positive vs. negative
# reviews (uniform-distribution variant of this analysis)
import matplotlib.pyplot as plt
import seaborn as sns

# Bucket review lengths by sentiment: >= 4 stars positive, <= 2 negative
positive_reviews = df[df['stars_x'] >= 4]['review_length']
negative_reviews = df[df['stars_x'] <= 2]['review_length']

plt.figure(figsize=(10, 6))

# Overlay one KDE-smoothed histogram per sentiment bucket
for lengths, colour, label in (
    (positive_reviews, 'green', 'Positive Sentiment'),
    (negative_reviews, 'red', 'Negative Sentiment'),
):
    sns.histplot(lengths, color=colour, kde=True, label=label)

plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths by Sentiment')
plt.legend()
plt.show()
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
# Distribution of review lengths by sentiment on the real distribution.
# NOTE(review): this cell is byte-for-byte identical to the previous one even
# though the two header comments claim "uniform" vs. "real" distributions —
# confirm whether a resampled DataFrame was meant to be used in one of them.
import matplotlib.pyplot as plt
import seaborn as sns
# Reviews with >= 4 stars count as positive, <= 2 as negative (3s excluded)
positive_reviews = df[df['stars_x'] >= 4]['review_length']
negative_reviews = df[df['stars_x'] <= 2]['review_length']
# Set up the figure
plt.figure(figsize=(10, 6))
# Overlaid histograms with kernel-density curves, one per sentiment
sns.histplot(positive_reviews, color='green', kde=True, label='Positive Sentiment')
sns.histplot(negative_reviews, color='red', kde=True, label='Negative Sentiment')
# Add labels and title
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths by Sentiment')
plt.legend()
# Show plot
plt.show()
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
# Box plot of review length per star rating (all five classes, plotted as-is).
# NOTE(review): the original header said "considering a uniform distribution",
# which does not match this cell — it plots the unmodified DataFrame.
# Set up the figure
plt.figure(figsize=(10, 6))
# One box per star rating, showing the review-length spread and outliers
sns.boxplot(data=df, x='stars_x', y='review_length')
# Add labels and title
plt.xlabel('Stars')
plt.ylabel('Review Length')
plt.title('Distribution of Review Lengths by Sentiment')
# Show plot
plt.show()
# Per-column count of missing values in the reloaded DataFrame
missing_values_count = df.isna().sum()

print("Number of records with missing values:")
print(missing_values_count)
Number of records with missing values: text 0 stars_x 0 categories 0 state 0 name 0 review_length 0 preprocessed_text 238 dtype: int64
# Keep only rows where 'preprocessed_text' is present (equivalent to
# dropna(subset=['preprocessed_text']))
df = df[df['preprocessed_text'].notna()]
We chose TF-IDF over alternatives such as CountVectorizer because it considers not only a word's frequency within a single document but also its frequency across the whole corpus (all documents). This gives the model an additional signal, letting it learn how much weight each word carries from a more general perspective.
# Binary sentiment setup: drop neutral reviews, split, undersample, binarize.

# Remove the rows with 3-star ratings (neutral — excluded from the binary task)
df = df[df['stars_x'] != 3]

X = df['preprocessed_text']
y = df['stars_x']

from sklearn.model_selection import train_test_split

# Hold out 20% of the data for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Undersampling: shrink every class in the training set down to the size of
# the smallest class so the classifier is not biased toward majority classes.
minority_class_size = y_train.value_counts().min()

# FIX: random_state added — the original .sample() calls were unseeded, so the
# undersampled training set differed on every run. The class list is built in
# a loop rather than four copy-pasted sample() calls.
undersampled_indices = pd.concat([
    y_train[y_train == star].sample(minority_class_size, replace=False, random_state=42)
    for star in (1, 2, 4, 5)
]).index

X_train_u = X_train.loc[undersampled_indices]
y_train_u = y_train.loc[undersampled_indices]

# Map star ratings to binary labels: 1-2 stars -> 0 (bad), 4-5 stars -> 1 (good)
label_map = {1: 0, 2: 0, 4: 1, 5: 1}
y_train_u = y_train_u.map(label_map)
y_test = y_test.map(label_map)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

# Pipeline: TF-IDF features -> Bernoulli Naive Bayes classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', BernoulliNB())
])

# Candidate min_df values (minimum number of documents a term must appear in)
min_df = [1, 2, 4, 8, 12, 16, 20, 40, 60, 80, 160]

# Parameter grid for the search (only min_df is tuned)
param_grid = {
    'tfidf__min_df': min_df,
}

# 5-fold cross-validated grid search, optimizing accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_u, y_train_u)

# Best min_df value found by cross-validation
best_min_df = grid_search.best_params_['tfidf__min_df']
print("Best min_df:", best_min_df)

# FIX: the original called best_model.fit(X_train_u, y_train_u) again here,
# but GridSearchCV(refit=True, the default) already refits best_estimator_
# on the full training set — the extra fit was redundant work.
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)
Best min_df: 2 Accuracy: 0.8780249576655061
import matplotlib.pyplot as plt

# Visualize the cross-validation results of the min_df grid search
results = grid_search.cv_results_

# Pair each candidate min_df value with its mean cross-validated accuracy
min_df_values = [params['tfidf__min_df'] for params in results['params']]
mean_test_scores = results['mean_test_score']

plt.figure(figsize=(10, 6))
plt.plot(min_df_values, mean_test_scores, marker='o')
plt.title('Mean Test Accuracy vs. Min Document Frequency')
plt.xlabel('Min Document Frequency')
plt.ylabel('Mean Test Accuracy')
plt.xticks(min_df_values)
plt.grid(True)
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer with the grid-search winner min_df=2
# (a term must appear in at least 2 documents to be kept)
tfidf_vectorizer = TfidfVectorizer(min_df=2)

# Fit the vocabulary on the undersampled training text only, then transform
# both splits to sparse TF-IDF matrices (fitting on train avoids test leakage)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_u)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

import pickle

# Persist the fitted vectorizer so later cells can reuse the same vocabulary
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Sanity check: (n_training_documents, vocabulary_size)
X_train_tfidf.shape
(1738088, 139023)
from sklearn.naive_bayes import BernoulliNB

# Initialize and train a Bernoulli Naive Bayes classifier on the binary
# (0 = bad, 1 = good) undersampled training set
nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_tfidf, y_train_u)
BernoulliNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BernoulliNB()
# Predict sentiment on the test set.
# NOTE(review): the recorded traceback below shows X_test_tfidf was undefined
# in that session — the TF-IDF vectorization cell must be run first.
y_test_pred = nb_classifier.predict(X_test_tfidf)

# Predict labels for the training dataset (used for an over/under-fit check)
y_train_pred = nb_classifier.predict(X_train_tfidf)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[39], line 2 1 # Predict sentiment on the test set ----> 2 y_test_pred = nb_classifier.predict(X_test_tfidf) 4 # Predict labels for the training dataset 5 y_train_pred = nb_classifier.predict(X_train_tfidf) NameError: name 'X_test_tfidf' is not defined
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model on the held-out test set
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

# Accuracy on the (undersampled) training set, for comparison against test
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Per-class precision / recall / F1 breakdown
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
Test Accuracy: 0.8831434614090583
Training Accuracy: 0.8330199621653219
Classification Report:
precision recall f1-score support
0 0.77 0.78 0.77 321703
1 0.92 0.92 0.92 935548
accuracy 0.88 1257251
macro avg 0.85 0.85 0.85 1257251
weighted avg 0.88 0.88 0.88 1257251
import os
import pickle

# FIX: ensure the output directory exists before writing — the original
# open('Model3/...', 'wb') raises FileNotFoundError if 'Model3/' is missing.
os.makedirs('Model3', exist_ok=True)

# Save the trained model
with open('Model3/nb_bin_model3.pkl', 'wb') as file:
    pickle.dump(nb_classifier, file)

# Reload the saved model immediately to verify the pickle round trip
with open('Model3/nb_bin_model3.pkl', 'rb') as file:
    loaded_classifier = pickle.load(file)

# Predict labels for the test dataset with the reloaded model
y_test_pred = loaded_classifier.predict(X_test_tfidf)

# Accuracy on the test dataset
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", test_accuracy)

# Predict labels for the training dataset
y_train_pred = loaded_classifier.predict(X_train_tfidf)

# Accuracy on the training dataset
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)
Accuracy: 0.8780249576655061 Training Accuracy: 0.8266599408001176
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted sentiment for the reloaded model
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Render as an annotated heatmap (raw counts, no colorbar) via the Axes API
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Sentiment')
ax.set_ylabel('True Sentiment')
plt.show()
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# FIX: the original used SVC(kernel='linear', C=1.0), whose training cost
# grows superlinearly (roughly quadratic-to-cubic) in sample count and is
# impractical on this ~1.7M-row sparse TF-IDF matrix (the cell has no
# recorded output). LinearSVC solves the same linear-SVM problem with
# liblinear and scales to large sparse data; C keeps its original value.
svm_classifier = LinearSVC(C=1.0)

# Train the SVM model
svm_classifier.fit(X_train_tfidf, y_train_u)

# Predictions on the test set
y_test_pred = svm_classifier.predict(X_test_tfidf)

# Predictions on the training set
y_train_pred = svm_classifier.predict(X_train_tfidf)

# Evaluate on the held-out test set
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

# Training accuracy for an over/under-fitting comparison
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Per-class precision / recall / F1 breakdown
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
from sklearn.metrics import confusion_matrix

# Confusion matrix for the SVM predictions.
# NOTE(review): this cell repeats the earlier confusion-matrix cell verbatim;
# it shows SVM results only because y_test_pred was reassigned by the SVM
# cell above.
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Plot as an annotated heatmap (raw counts, no colorbar)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Sentiment')
plt.ylabel('True Sentiment')
plt.show()
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Tokenize the text data, keeping only the 10,000 most frequent words
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train_u)  # build the vocabulary from training text only

# Convert each review into a sequence of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train_u)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to length 100 so the LSTM gets fixed-size input
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# Embedding layer trained from scratch (no pretrained weights); input_dim
# matches the tokenizer's num_words=10000, mapping each word to a 16-dim vector
embedding_layer = Embedding(input_dim=10000, output_dim=16)

# RNN model: Embedding -> LSTM(32) -> single sigmoid unit for binary sentiment
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with Adam, tracking accuracy during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Shape sanity checks before training
print("X_train_pad shape:", X_train_pad.shape)
print("y_train shape:", y_train_u.shape)
print("X_test_pad shape:", X_test_pad.shape)
print("y_test shape:", y_test.shape)
X_train_pad shape: (1738088, 100) y_train shape: (1738088,) X_test_pad shape: (1257251, 100) y_test shape: (1257251,)
# Train the model for 5 epochs.
# NOTE(review): the test set is passed as validation_data, so the data used
# for per-epoch monitoring is the same data later used for final evaluation —
# a separate split (e.g. validation_split=0.1) would avoid this leakage.
history = model.fit(X_train_pad, y_train_u, epochs=5, batch_size=128, validation_data=(X_test_pad, y_test))
Epoch 1/5 13579/13579 ━━━━━━━━━━━━━━━━━━━━ 357s 26ms/step - accuracy: 0.9316 - loss: 0.1723 - val_accuracy: 0.9657 - val_loss: 0.0941 Epoch 2/5 13579/13579 ━━━━━━━━━━━━━━━━━━━━ 354s 26ms/step - accuracy: 0.9607 - loss: 0.1055 - val_accuracy: 0.9670 - val_loss: 0.0872 Epoch 3/5 13579/13579 ━━━━━━━━━━━━━━━━━━━━ 362s 27ms/step - accuracy: 0.9652 - loss: 0.0940 - val_accuracy: 0.9709 - val_loss: 0.0798 Epoch 4/5 13579/13579 ━━━━━━━━━━━━━━━━━━━━ 363s 27ms/step - accuracy: 0.9678 - loss: 0.0872 - val_accuracy: 0.9730 - val_loss: 0.0745 Epoch 5/5 13579/13579 ━━━━━━━━━━━━━━━━━━━━ 367s 27ms/step - accuracy: 0.9702 - loss: 0.0817 - val_accuracy: 0.9692 - val_loss: 0.0831
# Evaluate on the test set (note: same data served as validation during fit)
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy}")

# Training-set accuracy for an over/under-fitting comparison
_, train_accuracy = model.evaluate(X_train_pad, y_train_u)
print(f"Training Accuracy: {train_accuracy}")
39290/39290 ━━━━━━━━━━━━━━━━━━━━ 160s 4ms/step - accuracy: 0.9689 - loss: 0.0841 Test Accuracy: 0.9692217111587524 54316/54316 ━━━━━━━━━━━━━━━━━━━━ 239s 4ms/step - accuracy: 0.9793 - loss: 0.0615 Training Accuracy: 0.9735997319221497
# Save the model.
# NOTE(review): the .h5 extension selects the legacy HDF5 format (see the
# Keras warning below, which recommends the native '.keras' format); the
# filename is kept as-is in case other cells load this exact path.
model.save("lstm_model1.h5")
WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`.
# Learning curves: per-epoch accuracy on training vs. validation data
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# One line per series, drawn in the same figure
for series, label in ((train_accuracy, 'Training Accuracy'),
                      (val_accuracy, 'Validation Accuracy')):
    plt.plot(series, label=label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy over Epochs')
plt.legend()
plt.show()
Model Interpretation¶
from lime.lime_text import LimeTextExplainer
import pickle

# Rebuild a test frame pairing each preprocessed review with its label
df_test = pd.DataFrame({'preprocessed_text': X_test, 'sentiment': y_test})

# Load the fitted TF-IDF vectorizer saved earlier in the notebook
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

# Load the trained Naive Bayes model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
    nb_classifier = pickle.load(file)

# Prediction function for LIME: raw texts -> class probabilities
def nb_predict_proba(texts):
    vectors = loaded_tfidf_vectorizer.transform(texts)
    return nb_classifier.predict_proba(vectors)

# LIME explainer labelled with the two sentiment classes
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Pick a random test instance to explain (unseeded, so it varies per run)
idx = np.random.randint(len(X_test))
text_instance = X_test.iloc[idx]
true_label = y_test.iloc[idx]

# Perturb the text and fit a local surrogate; keep the 10 strongest features
explanation = explainer.explain_instance(text_instance, nb_predict_proba, num_features=10, top_labels=1)

# Show the instance, its true label, and the model's prediction
print('Text instance:', text_instance)
print('True label:', true_label)
print('Predicted label:', nb_classifier.predict(loaded_tfidf_vectorizer.transform([text_instance]))[0])

# Top features and their weights for the predicted label
top_features = explanation.as_list(label=explanation.top_labels[0])
for feature, weight in top_features:
    print(f"{feature}: {weight}")

# Render the highlighted explanation inline in the notebook
explanation.show_in_notebook(text=text_instance)
Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming True label: 0 Predicted label: 0 poor: 0.21160499439554004 subpar: 0.20413545671917072 not_even: 0.16699528039868544 not_pay: 0.1628039063187502 not_1: 0.16013969728307917 disappointed: 0.11251325186433776 minute: 0.11204368151556912 not_be: 0.11072357467954254 30: 0.10755731748425412 highly: -0.06834064308931294
import numpy as np
from lime.lime_text import LimeTextExplainer
import pickle

# Load the fitted TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

# Load the trained Naive Bayes model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
    nb_classifier = pickle.load(file)

# Prediction function for LIME: raw texts -> class probabilities
def nb_predict_proba(texts):
    vectors = loaded_tfidf_vectorizer.transform(texts)
    return nb_classifier.predict_proba(vectors)

# LIME explainer labelled with the two sentiment classes
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Input your review here
text_instance = "friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming"

# Run the same preprocessing the model was trained on
preprocessed_text_instance = preprocess_text(text_instance)

# Explain the prediction for the preprocessed text instance
explanation = explainer.explain_instance(preprocessed_text_instance, nb_predict_proba, num_features=10, top_labels=1)

print('Text instance:', text_instance)
# FIX: label the second line distinctly (the original printed
# 'Text instance:' twice)
print('Preprocessed text instance:', preprocessed_text_instance)
# FIX: predict on the PREPROCESSED text so the printed label matches the
# input the explanation above was computed on (the original predicted on
# the raw input instead)
print('Predicted label:', nb_classifier.predict(loaded_tfidf_vectorizer.transform([preprocessed_text_instance]))[0])

# Top features and their weights for the predicted label
top_features = explanation.as_list(label=explanation.top_labels[0])
for feature, weight in top_features:
    print(f"{feature}: {weight}")

# Render the highlighted explanation inline
explanation.show_in_notebook(text=preprocessed_text_instance)
Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming Predicted label: 0 poor: 0.22694478715492192 subpar: 0.19242713838680955 not_pay: 0.16451148406906657 not_1: 0.16102168894280453 not_even: 0.1532261083751255 30: 0.13008937176242705 minute: 0.12345846174484312 not_be: 0.11608615751599044 disappointed: 0.11568269403809768 highly: -0.07064456285448852
from lime.lime_text import LimeTextExplainer
import pickle
# Load the preprocessed test data
df_test = pd.DataFrame({'preprocessed_text': X_test, 'sentiment': y_test})
# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
loaded_tfidf_vectorizer = pickle.load(file)
# Load the trained model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
nb_classifier = pickle.load(file)
# Define a function to classify text using the trained model
def nb_predict_proba(texts):
    """Vectorize raw texts with the saved TF-IDF and return NB class probabilities."""
    return nb_classifier.predict_proba(loaded_tfidf_vectorizer.transform(texts))
# Initialize LIME TextExplainer for the binary sentiment task.
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Accumulate each feature's total LIME weight across reviews.
important_words = {}

# Explain the first 50 test reviews (sequential, not random, so the run
# is reproducible — the original comment claiming "random" was wrong).
for idx in range(50):
    text_instance = X_test.iloc[idx]
    true_label = y_test.iloc[idx]
    # Explain the model's prediction for this review.
    explanation = explainer.explain_instance(
        text_instance, nb_predict_proba, num_features=10, top_labels=1
    )
    # Sum the weights of the top features for the predicted label.
    for feature, weight in explanation.as_list(label=explanation.top_labels[0]):
        important_words[feature] = important_words.get(feature, 0.0) + weight

# Normalize weights to fractions of the total.  Guard against a zero
# total: LIME weights are signed, so positive and negative
# contributions could cancel and the division would otherwise crash.
total_weight = sum(important_words.values())
if total_weight:
    important_words = {k: v / total_weight for k, v in important_words.items()}

# Sort the important words by weight, most positive first.
important_words = dict(
    sorted(important_words.items(), key=lambda item: item[1], reverse=True)
)

# Print the aggregated, normalized word weights.
for word, weight in important_words.items():
    print(f"{word}: {weight}")
phone: 0.0685157773880253 email: 0.03604740988252881 paid: 0.0354131963833722 suck: 0.028149012817759195 called: 0.026233646484694697 excuse: 0.025800836269966848 inattention: 0.02570275023257392 administration: 0.024443757802595858 half: 0.02328105361903188 carpet: 0.02322912205927775 attitude: 0.02288640903290162 hallway: 0.022868995177190158 industry: 0.022521533251605354 never: 0.022193439642317035 soggy: 0.021641088610639597 experienced: 0.020981899709956436 said: 0.020795231916628985 min: 0.02021370604593308 disappointed: 0.018554123200798717 15: 0.018407337816177267 dropped: 0.017516243771648822 disrespectful: 0.017471681238841318 literally: 0.016798948665660264 hostess: 0.016682130974701122 changed: 0.015885289535455926 expiration: 0.015645001202966885 arrive: 0.014142613752375794 rude: 0.013617695173746908 could: 0.013599139839455227 disinfectant: 0.013538066041098381 inconsiderate: 0.012874705695045592 wipe: 0.012768224522495309 money: 0.012762778171706603 walmart: 0.012715772033434866 say: 0.012309599002702407 call: 0.012223168288785273 proceeded: 0.011586689930983187 ruined: 0.011339292251219388 100: 0.011232416821351609 eh: 0.01105863328833015 car: 0.011048374753406661 window: 0.01076026750340935 guess: 0.01053730123183656 left: 0.010453075269575136 hour: 0.010310874433751149 fail: 0.01024505593266344 comment: 0.010226298750247166 would: 0.010080674179301054 enjoyed: 0.010034445094516127 unfortunately: 0.009936573450198907 flu: 0.009790051233331617 refunded: 0.009575809216777682 fault: 0.009408671692091622 rant: 0.009350390961346523 stated: 0.00917779480840085 delicious: 0.008973338096866155 charge: 0.008683797165814074 stolen: 0.008674868248512423 fishy: 0.008463277850371867 going: 0.008436414824917568 response: 0.008344126541345313 extensive: 0.00819102705907891 perfect: 0.007690225789779001 contact: 0.007426294254051179 emailed: 0.007293908718269971 sorry: 0.006996833413161035 40: 0.00655045635547875 eviction: 0.006395983717344581 minute: 
0.006329270416015321 another: 0.006205343737116793 asking: 0.006024228566726314 worse: 0.005992759886056528 appointment: 0.005747327760490766 50: 0.005715234628420314 forbid: 0.00556048697083014 canceled: 0.004926332582322096 fantastic: 0.004785472664770513 told: 0.004395616429337232 loved: 0.004291213121441303 somebody: 0.004247760114773865 understand: 0.004242298153325464 meaty: 0.0041894004475928285 bother: 0.004068377308423213 fresh: 0.002679939375884708 asked: 0.0025147510507486136 unwelcoming: 0.0020155888806332976 nigger: 0.001822524892665566 visibly: 0.0017707496994619084 tasty: 0.0016872853300274652 caucasian: 0.0016453484211849032 breakfast: 0.001574048345578634 outdated: 0.0015378199630342623 fuck: 0.0015186032086513758 bitch: 0.00148837011158734 embarrassed: 0.001442643722918538 uncomfortable: 0.001387866188874136 thanks: 0.001372725150520641 little: 0.0012931479962975552 cancelled: 0.000228155822599848 update: 0.00022114271549712554 advising: 0.00021409426558062554 post: 0.00020943003772807618 sadly: 0.00019583391079738884 spoke: 0.00019456986953886364 abruptly: 0.0001933551205657392 spa: 0.0001894985253882692 incredible: 0.00011116878126463624 great: 7.985895862032405e-05 vibe: 6.526236216291725e-05 pleasantly: 5.69036627905405e-05 comfortable: 4.163235963622531e-05 spacious: 4.1525285053902235e-05 adventure: 4.012596976004274e-05 modern: 3.300135054514177e-05 jawn: 2.9526635762029154e-05 yum: 2.8331284732003837e-05 good: 2.3184925114363835e-05 amazing: 1.7635005716509283e-05 raspberry: 1.7576813179436042e-05 knowledgeable: 1.6582819582198083e-05 town: 1.612617566749203e-05 everything: 1.5468477397719526e-05 instagram: 1.488256309505478e-05 atmosphere: 1.3506666560818322e-05 surprised: 1.2555938975828949e-05 crowded: 8.83712393828958e-06 flavorful: 8.649691870745958e-06 selection: 8.461496170813175e-06 parking: 8.356753752182271e-06 cleanest: 7.544238597917003e-06 generous: 7.4994652648128985e-06 offer: 7.046840335037891e-06 cigar: 
7.038773488300331e-06 cocktail: 6.712936497147508e-06 die: 5.555120056690995e-06 fast: 5.55428094408912e-06 place: 5.4733502525905034e-06 awesome: 5.35509153066075e-06 relaxing: 5.287059395753344e-06 mary: 4.752121221936916e-06 deli: 4.088691028288867e-06 pudding: 3.903106885514056e-06 fabulous: 3.878837322278016e-06 pleased: 3.5533282763979564e-06 helpful: 3.2070409413710247e-06 champagne: 2.972861385369738e-06 plus: 2.8087983855605457e-06 orleans: 2.5860833189833933e-06 dat: 2.3902847973843277e-06 blackened: 2.1543418090877566e-06 flavor: 2.1473652843097682e-06 nice: 1.942773005631924e-06 alex: 1.7248692126332233e-06 super: 1.6709606149150086e-06 excellent: 1.6683069094798176e-06 unassuming: 1.5538693054070544e-06 mexican: 1.4639508196247593e-06 spot: 1.4597662821806646e-06 egg: 1.4506748227265227e-06 phenomenal: 1.3950114402615038e-06 helped: 1.3832256273683992e-06 refreshing: 1.2398753885540714e-06 robust: 1.1856900899899766e-06 mint: 1.1578551072057765e-06 cafe: 1.1508643523059103e-06 relaxed: 1.1136514784111252e-06 fave: 1.1019731986630542e-06 efficient: 1.0702145753397589e-06 free: 1.0313823358175282e-06 music: 9.374314210327356e-07 accommodating: 9.257100506772497e-07 gorgeous: 9.122611149084814e-07 rachel: 8.052448168176733e-07 team: 7.846923052474428e-07 neighborhood: 7.269136167262318e-07 pistachio: 6.95156332549067e-07 pup: 6.882503013696034e-07 socialization: 6.790000994183223e-07 freshest: 6.661099633392638e-07 bbq: 5.611532803302918e-07 luna: 5.600461308591414e-07 magic: 5.423153327083264e-07 cannolis: 5.408825457194764e-07 filling: 4.659662516910441e-07 visiting: 4.6238406884429605e-07 cannoli: 4.563852961465635e-07 cute: 4.2192895536105323e-07 perfectly: 3.935404534274366e-07 robyn: 3.8955477378292144e-07 outdoor: 3.632129523116086e-07 crowd: 3.5387983698575093e-07 verdura: 3.4045833576471894e-07 rabe: 3.1512473706957837e-07 roasted: 3.1164809779896456e-07 burrata: 3.0324822878091053e-07 handmade: 2.981970334496181e-07 seeded: 2.96388963681357e-07 
knowledgable: 2.9585648129625284e-07 obsession: 2.831758794197096e-07 ingredient: 2.739605741031535e-07 brunch: 2.53706119676107e-07 pop: 2.487459689359421e-07 authentic: 2.449594209161656e-07 camp: 2.247856413300624e-07 recommendation: 2.2436883117646956e-07 glad: 2.2375020145729127e-07 brandy: 2.0902365905315557e-07 taco: 2.0248868823361895e-07 crisp: 1.9440864836014732e-07 pork: 1.9077708702894763e-07 quail: 1.7121464377966933e-07 cauliflower: 1.673076209510584e-07 mature: 1.4721538514271802e-07 apple: 1.373326879332264e-07 chop: 1.334548656935044e-07 farm: 1.1869244743321804e-07 fried: 8.681013357513295e-08 twinkling: 5.582334211426152e-08 smoked: 5.243234098472865e-08 whenever: 4.615029758900991e-08 huge: 4.503704855448237e-08 opted: 4.294842509884994e-08 nestled: 3.9187881015284615e-08 special: 3.824246605112361e-08 sehr: 3.723068657824672e-08 food: 1.4829301206004597e-08 drowned: 9.22703992445554e-09 nearly: 8.769351144981087e-09 counter: 7.0341336157727915e-09 something: 4.0775458515528605e-09 joke: -4.9606549563216225e-09 starting: -1.8387173991169526e-08 health: -9.534759689811138e-08 else: -9.898902986703521e-08 brought: -1.7924489807039915e-07 go: -2.0050643141464257e-07 day: -2.0965023888534188e-07 got: -2.1017035013397833e-07 14: -2.181240648366245e-07 check: -2.1887095110952963e-07 without: -2.600556071783093e-07 ignore: -2.7996575960575456e-07 long: -3.52199367808353e-07 solely: -3.608053128530278e-07 missed: -3.671222554646579e-07 trying: -3.9712856562284974e-07 skin: -4.6217152377063516e-07 container: -5.184999604368133e-07 based: -5.592618453427498e-07 away: -5.785891095005176e-07 complete: -6.105693428263065e-07 woman: -6.950046515706071e-07 wanted: -7.977945303153912e-07 training: -8.281916050299413e-07 work: -9.396489702086812e-07 quality: -1.01736404295172e-06 80: -1.062339123593751e-06 bland: -1.1601503277891734e-06 service: -1.1898880993357882e-06 gotten: -1.2934582601973758e-06 want: -1.3549582812421482e-06 buy: -1.455752095756881e-06 
cash: -1.6036252106888743e-06 kitchen: -2.208137532402526e-06 explained: -2.243821797899918e-06 customer: -2.3676043158865408e-06 toooooo: -2.433736247306593e-06 anything: -2.693799248620165e-06 10: -2.9883773734722908e-06 spoiled: -3.402050132572048e-06 door: -3.4423959682408115e-06 twice: -3.984204253859772e-06 90: -4.7032868371827575e-06 waiting: -5.321203597463813e-06 please: -6.275078499391923e-06 time: -6.62302022803796e-06 stay: -7.267163027980131e-06 spend: -8.243539877443775e-06 deal: -8.254958474007789e-06 one: -8.894031380093509e-06 dunkin: -9.25838631687393e-06 hard: -9.281816998349412e-06 hotel: -9.49097296238249e-06 better: -9.587287213617943e-06 stepped: -1.0624570754605655e-05 later: -1.1954440402291877e-05 way: -1.2785971132295523e-05 given: -1.3239094011962047e-05 nothing: -1.3781537284890898e-05 employee: -1.6413380228167497e-05 problem: -2.007802361326408e-05 pick: -2.023325091933217e-05 ordered: -2.0506317142289972e-05 kno: -2.1872518937179815e-05 take: -2.266327986847173e-05 write: -2.3272297378818022e-05 brag: -2.3364098998436573e-05 expecting: -2.4474039995143885e-05 earlier: -2.5640084318040904e-05 logical: -2.5985462421336526e-05 encountered: -2.693958662503368e-05 approved: -2.8161991959679447e-05 temperature: -3.052076968921325e-05 entirely: -3.23676209251275e-05 security: -3.6357993280583676e-05 estimated: -3.7055073230609636e-05 leave: -4.1552635951755976e-05 clearly: -4.179230352701787e-05 forgettable: -4.5761567378273315e-05 ok: -0.0003813372216789702 received: -0.0005698087456068185 worth: -0.0008168081498281124 easy: -0.0009273584374386028 single: -0.001209269132780663 basic: -0.0012224408849066401 get: -0.0012439323132689443 ago: -0.0013294467676720101 even: -0.0013575886258995215 best: -0.001370568130493848 review: -0.0014167989987409412 compassionate: -0.0014397453837893491 two: -0.0014741023927823632 homey: -0.001475653535963959 possible: -0.001531607258712582 manager: -0.0015604561228142288 complained: -0.0016249007173343214 
issue: -0.0016454559511670148 turned: -0.0016724924423614225 noone: -0.0017910461787501452 give: -0.0018264747585328594 order: -0.0018409913037869734 computer: -0.001946157886845705 kick: -0.001959121352959343 desk: -0.002060553129364634 ask: -0.0024458369609634645 friendly: -0.002523725102678545 sprite: -0.002993940948672193 um: -0.0031710994397280297 wonderful: -0.004123746702448322 love: -0.004148046260369466 highly: -0.004535849430192918 recommend: -0.005086408401837509 missing: -0.005799792699334027 instead: -0.006102141275695025 always: -0.006808301014697158 posted: -0.007015931790672499 locally: -0.00834393240020323 approachable: -0.008785900064654609 dramatically: -0.010127012933660987 outstanding: -0.010185368877089162 calling: -0.010324286662243231 definitely: -0.010931056369454399 favorite: -0.011765854892737381
nb_predict_proba([X_test.iloc[2]])
array([[2.92973824e-07, 9.99999707e-01]])
lstm_predict_proba
array([[0.05784018]], dtype=float32)
X_test.shape
(1257251,)
from lime.lime_text import LimeTextExplainer
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE: X_train/y_train/X_test/y_test, the fitted `tokenizer`, and the
# padding length are assumed to be defined earlier in the notebook.

# Restore the trained LSTM from its HDF5 checkpoint.
lstm_model = tf.keras.models.load_model("lstm_model1.h5")

# Sample one test review at random for explanation.
idx = np.random.randint(len(X_test))
text_instance = X_test.iloc[idx]

# Convert the review into a padded integer sequence for the LSTM.
text_instance_seq = tokenizer.texts_to_sequences([text_instance])
text_instance_pad = pad_sequences(text_instance_seq, maxlen=100)
# Define the predict_proba_fn function
def predict_proba_fn(texts):
sequences = tokenizer.texts_to_sequences(texts)
sequences_pad = pad_sequences(sequences, maxlen=100)
return lstm_model.predict(sequences_pad)
# Initialize LIME TextExplainer
explainer = LimeTextExplainer(class_names=['negative', 'positive'])
# Explain the prediction for the random instance
explanation = explainer.explain_instance(text_instance, predict_proba_fn, num_features=10, top_labels=1)
# Print the explanation
print('Text instance:', text_instance)
print('True label:', y_test.iloc[idx])
print('Predicted label:', np.argmax(lstm_model.predict(text_instance_pad)))
explanation.show_in_notebook(text=text_instance)
WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
157/157 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step Text instance: dinner last night pleasantly surprised since read negative review place beforehand I got say not_clue talking burger great cake even better staff friendly supervisor even went table table making sure everyone good dining experience rarely see anymore place definitely back True label: 1 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Predicted label: 0
1
# Assuming X_test is your test data and y_test is the corresponding labels
# Draw 50 distinct positions from the test set to spot-check the model on.
sample_positions = np.random.choice(len(X_test), size=50, replace=False)
# For each sampled review: tokenize, pad to the training length, predict,
# and print the predicted sentiment next to the ground-truth label.
for position in sample_positions:
    review_text = X_test.iloc[position]
    padded = pad_sequences(tokenizer.texts_to_sequences([review_text]), maxlen=100)
    # Single sigmoid output: P(positive) for this review.
    prob_positive = lstm_model.predict(padded)[0]
    predicted_sentiment = 'positive' if prob_positive[0] > 0.5 else 'negative'
    true_sentiment = 'positive' if y_test.iloc[position] == 1 else 'negative'
    # Show the review alongside both labels.
    print('Text:', review_text)
    print('True sentiment:', true_sentiment)
    print('Predicted sentiment:', predicted_sentiment)
    print()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step Text: scheduled 8am called say coming 8am 9am professional courteous wanted would definitely recommend others True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step Text: family 4 made reservation monday evening restaurant 25 full immediately seated left sitting table 17 minute without ever someone come acknowledge sat not_drink order not_water nothing many good option area spend money place show lack consideration customer True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step Text: ambiance wonderful brass unique seat think traditionally dressed staff food average though went dinner got good assortment food dish bit lacking flavor profile lamb iskender gyro meat not_wow portion nicely sized though lot filler item plate fry fried bread meat dish made think twice price quantity overall may go back couple month operation new place perhaps grow better another turkish place louis still heart plus right across street great frozen custard place True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step Text: well I officially clayton resident new place minute walking distance combo kaldi quickly becoming morning office not_express happy make able wake short walk away good breakfast awesome coffee free topic one complaint hour limit sure give new username password free need next hour happens middle big project wireless time run major frustration let tell not_extend little longer food enough satisfy new favorite thing order morning quiche day yesterday served breakfast potato sweet potato regular banana perfect cup cinnamon hazelnut coffee topic banana random little mom would always slice banana put bowl milk give spoon anyone else really diverse crowd morning fun people watch catching news drinking coffee actually ideal way spend every single morning chance stop see banana milk course True sentiment: positive Predicted 
sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step Text: anniversary dinner place great say not_hardly imagine anyone could score four star start service outstanding justin clearly take profession seriously family plenty time chat course coming neither rush slow literally right came time recommendation encouraged u provided additional information led extraordinary dining selection salmon impeccable beef even better several good restaurant fine main course eastland raised bar even higher pairing main course excellent vegetable eating enjoying part meal realized big portion wife decided order side item omg green chile mac cheese fried green tomato also superb nothing frozen not_shame u asked doggie bag meal great I sure next serving good longtime nashvillian I bit embarrassed family discovered eastland cafe food atmosphere service price made experience remember quite time True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step Text: first food overall get 5 star service little slow place not_overly busy would star could beverage pretty boy spicy watermelon margarita wonderful neither sweet pretty boy definitely stronger kick watermelon marg enough spice slight heat without making drink hot enjoy fruity flavor apps kept simple rolled chip sauce trio tres amigo salsa honestly star roast tomato wood fire oven smokey flavor definitely pull order queso guacamole well meal mom enchilada opted spinach mushroom went chorizo exceptional however agreed spinach option better lime used really pulled overall dish super light comparison expected pleasant surprise great vegetarian option know not_pictured street corn could not_wait try one favorite food devoured could snag picture personally could smidge heat bowl empty left know worth checking definitely splitting item menu still explore area drink desired True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step Text: I traveling greenwood last year 
work thought starbucks coffee town today coffee shop go great menu friendly baristas polite knowledgeable reasonably priced coffee make great flat white also lot room sit work True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step Text: I glad took drive shop search kayak done research still unsure kayak would fit need use scott helpful asking question listening need installed crossbar vehicle showed proper way strap kayak would highly recommend anyone looking purchase kayak canoe stop not_be disappointed True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: enjoyed time effie dining going kimmel center block northwest not_know byob informed like glass wine dinner wanted get full effie experience went may gone bit overboard appetizer got spanakopita far favorite table also got fried zucchini chip came amazing dipping sauce calamari pretty good came really key lemon slice marinara ended relatively full first course soldiered boyfriend got gyro platter really enjoyed meat pea think not_too appetizing overshadowed main attraction mom shared mixed grill ultimate greek platter gyro sausage lamb shank lamb kabob chicken kabob might came fry could not_eat one decent tzatziki tiniest little greek salad ever seen server sweet sure fill water glass food came quickly door plenty time theater come back I bringing wine going plan stay savor thing even bring course True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step Text: good service excellent ordered philly chicken sandwich also ordered egg plant fry delicious True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: reached gate 1 travel hope could assist upcoming trip croatia attending wedding croatia hotel arrangement 2 night trip looking go 10 day already something arranged two night trip said sorry not_help rude never waste time reaching thanks nothing gate 1 
True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: went open 3 day drink pollo panni focscia bread al good went back week later ordered came hamburger bun rip ordered johnny walker black rock charged drink went back today lunch not_lunch bar menu anymore waitress said new menu two salad one beer coffee foodie dine central south jersey well philly place horrible go rt 130 little bring bottle wine fratelli super food reasonable priced avoid portico bad deal True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: new favorite sandwich place santa barbara highly coveted position really feel like sandwich worth money atmosphere fun owner hysterically funny star would highly recommend True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step Text: looking forward trying fresh food pretty disappointed place food drink barely mediocre overall atmosphere pretty stale boring service great waiter helpful answering question went small group girl check not_separated found odd inconvenient not_be returning True sentiment: positive Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step Text: friend went percy day first time say food great cornbread good full slab rib delicious cocktail much needed drink hot day course try award winning pecan pie okay okay try see good mine good pie competition although pie die not_pleased cost ice cream would think would come oh delicious pecan pie digress would recommend everyone try place I back True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step Text: love first experience ok go food always fresh meat tender delicious True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step Text: excellent walking stick look gorgeous perfect trip london True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step 
Text: would definitely give food cafe abyssinia five even star love veggie combo eaten ethiopian many time food rank among favorite restaurant homey feel almost like eating someone private residence nice sit outside unfortunate car park right next patio occasionally smell exhaust fume eat otherwise I huge fan byob nice True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step Text: forgotten local took whole fish menu truly tourist attraction miss u season True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step Text: dr lundeen work right would normal person want result surgery repair left dr lundeen not_doing surgery True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: tasty food wonderful atmosphere thing wished marquee 9 appetizer deliciously sweet pineapple avocado dip plantain chip would gone little bit everything else tried sufficient portion dinner menu includes several cuban sandwich breakfast choice recommend calle ocho sandwich pickle roast pork side house salad made fresh dressing one highlight visit handmade drink beautifully presented bar new restaurant often crowded 6 9pm definitely get reservation True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step Text: heard place friend dropped car within hour done I detailing place place really get done high standard not_corners cut I not_do deep inspection car could tell getting car back immaculate job fair price nice people great service True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step Text: worst wendy ever gone rude terrible food scared eat meal hope never visit wendy True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step Text: saturday afternoon decided needed haircut new look not_appointment walked scout thrilled kendra not_longer cutting hair skylar young lady assigned 
taken photo past wanted cut not_only listen wanted spent good amount time getting right went long hair decided time get style instead wearing time happy definitely going back scout specifically ask skylar thank True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step Text: poor quality hotel hilton not_be back level customer service could lot better shame management general manager not_respond review online kinda make wonder care hotel anyway chose another hotel not_much around hotel travel everywhere nothing see nearby True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step Text: best lentil soup I ever life must like spicy little kick come back husband town True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step Text: priced basic seasoned food cocktail fine food limited way priced 35 2 piece chicken come want tell friend got avoid want good meal True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step Text: definitely not_have good time came late night cold could not_enjoy view staff working winery daughter bad vibe staff not_that friendly excitable wine tasting bottle understand close end night ambiance not_good not_one really check either kind hassle return bar time get next tasting hoping seat not_get jacked hopefully staff work customer service friendliness help marketing retaining customer location thing would make return True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step Text: joshua server amazing informative eaten sushi made experience eau exciting food delicious pic best sushi I indy dessert world not_wait go back True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step Text: not_good hotel could better better service checked late got passed inquire everything bed stand employee shuttle driver michael turned around introduced offered 
everyone water cookie standard not_the exception True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step Text: tour great way see different side new orleans great tour guide knew city well kept everything fun big plus bike super comfortable new orleans street highly recommend tour True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: tommy gunns look would never expect little shack ridge ave would pack huge punch barbecue flavor introduced gunns way back roommate manged place always blessed leftover deep fried mac cheese pork brisket sandwich course corn bread I sandwich pork brisket never better bbq side amazing must visit place higher end term cost worth mood True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step Text: tried lobster ravioli nice flavor really not_lobster seemed like regular ravioli lobster flavoring little disappointing giving 4 star good service alfredo dipping sauce good breadstick True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step Text: definitely best southern food I usually judge place meat green point kinda divey side murf rd lavernge well worth True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step Text: shrimp oyster et tu fe oyster raw grilled particularly liked grilled cooked enough cook cheese top not_kill oyster cajun shrimp et tu fe outstanding overall may lack timely service may bit touristy make large portion especially side great happy hour friendly staff nice balcony True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step Text: not_idea anything 1 start not_even describe order messed pizza cold wing gooey disgusting grab pizza costco deli heat avoid terrible experience True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step Text: exton ups store made usual 
madness holiday shipping surprisingly easy delightful aside 5 minute wait cold safety prevents many customer inside time super easy young woman helped jazlyn simply best cheerful helpful sweet snuck right closing although clear slammed day friendly kind huge kudos ups store not_all go way support level customer service True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step Text: food great recommend place anyone customer service impeccable julie great service point impressed return get back new orleans True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step Text: thank joy jose maintainence helping u transition chestnut hill village apartment initial issue unit solved thorough professional manner pleased result townhouse truly becoming home seeking True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step Text: made trek best german restaurant il good ever read previous review food consistently good menu not_changed not_need change german beer solid german cuisine prepared served really nice people reasonable price could one ask previous review accurate originally wrote least 10 time since read date food u True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step Text: breakfast not_i would like cafe con leche sorry machine broken I philly cheesesteak ok please add mushroom sorry not_mushrooms wtf first last visit food not_come yet please pray u True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step Text: good food simple atmosphere store layout service quick efficient food prepared well seasoned sauced good thai food without americanized True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step Text: recently bought townhouse reno quickly realized get warmer used elected ceiling fan put every room turned trusty friend yelp requested several estimate I 
not_sure majority smoking recreational marijuana price lead time crazy olectric exception based positive yelp review decide give shot dale justin showed time ready work walked job pointed potential issue got guy not_weekend warrior selfers pro minimized potential issue great job budget ka ching would not_hesitate use recommend thanks olectric great job True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step Text: iced chai tea latte not_on menu not_stop making one ask make exactly like level sweetness want milk spiciness try adjust mmmmmm drained not_time flat staff friendly helpful nothing seems range decor fun ton look inviting comfy place little way totally fit neighborhood neighboring shop great place exploring part town True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step Text: went drive thru cappuccino think person window name bryce say energy booboo friendly True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step Text: veterinarian vet technician phenomenal seemed truly passionate job friendly gentle nervous dog took right away concern health one best vet I ever whenever know pet good hand True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step Text: visiting nola week stopped never expecting food amazing pimento cheese biscuit biscuit amazing sausage hint spice not_too spicy also ordered peanut butter hot chocolate must chocolate lover think liquid peanut butter cup better True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step Text: crappy service one 1 person workin counter phone ur bos get get discount cause not_have authority b u employee give permission speed bit 20 min get coherent tube ridiculous True sentiment: negative Predicted sentiment: negative 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step Text: rude tsa agent make beautiful easily navigate otherwise florida airport 
lose full star say rude mean yell face not_listen kindly trying explain issue russian roulette security screening may nice may ruin day horribly get tsa pre save trouble aside likely one top 3 favorite airport united state america great layout great charging station everywhere even cool airport started installing decent dining option great connection international city many direct flight offered domestic destination layover connecting flight usually make sense hope not_change recent expansion renovation improving seems accommodate traffic yet retain current state affair great location close go beautiful tampa bay beach petersburg easy access downtown tampa well speaking car several rent car company claim part huge parking garage stone throw elevator terminal baggage claim got someone picking dropping not_problem first hour parking also free I always happy land depart tpa dandy dandy tsa pre unless american airline ruin wiping account asked second last name added magically disappeared account morning grin True sentiment: positive Predicted sentiment: positive 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step Text: cheap sandwich good topping choice pick bread ton bakery option like served cool unless come night kind stale picked looked like decent grocery also low priced decent sandwich good spot check True sentiment: positive Predicted sentiment: positive